Sciences Po climate terms
Frequency of articles containing a few select terms in a few select publications over time.
!pip install statsmodels
!pip install vl-convert-python
!pip install altair
!pip install pygam
Requirement already satisfied: statsmodels in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (0.14.5) Requirement already satisfied: numpy<3,>=1.22.3 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from statsmodels) (1.24.1) Requirement already satisfied: scipy!=1.9.2,>=1.8 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from statsmodels) (1.13.1) Requirement already satisfied: pandas!=2.1.0,>=1.4 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from statsmodels) (1.5.3) Requirement already satisfied: patsy>=0.5.6 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from statsmodels) (1.0.2) Requirement already satisfied: packaging>=21.3 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from statsmodels) (23.0) Requirement already satisfied: python-dateutil>=2.8.1 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2022.7.1) Requirement already satisfied: six>=1.5 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas!=2.1.0,>=1.4->statsmodels) (1.17.0) Requirement already satisfied: vl-convert-python in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (1.8.0) Requirement already satisfied: altair in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (5.5.0) Requirement already satisfied: jinja2 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from altair) (3.1.2) Requirement already satisfied: jsonschema>=3.0 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from altair) (4.17.3) Requirement already satisfied: narwhals>=1.14.2 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from altair) (2.9.0) 
Requirement already satisfied: packaging in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from altair) (23.0) Requirement already satisfied: typing-extensions>=4.10.0 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from altair) (4.12.2) Requirement already satisfied: attrs>=17.4.0 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from jsonschema>=3.0->altair) (24.3.0) Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from jsonschema>=3.0->altair) (0.19.3) Requirement already satisfied: MarkupSafe>=2.0 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from jinja2->altair) (2.1.2) Requirement already satisfied: pygam in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (0.10.1) Requirement already satisfied: numpy>=1.5.0 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from pygam) (1.24.1) Requirement already satisfied: progressbar2<5,>=4.2.0 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from pygam) (4.5.0) Requirement already satisfied: scipy<1.17,>=1.11.1 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from pygam) (1.13.1) Requirement already satisfied: python-utils>=3.8.1 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from progressbar2<5,>=4.2.0->pygam) (3.9.1) Requirement already satisfied: typing_extensions>3.10.0.2 in /home/boogheta/.pyenv/versions/3.9.5/lib/python3.9/site-packages (from python-utils>=3.8.1->progressbar2<5,>=4.2.0->pygam) (4.12.2)
# preamble: imports and raw data load
import pandas as pd
import altair as alt
# lift Altair's default 5000-row limit so the full monthly series can be plotted
alt.data_transformers.disable_max_rows()
import numpy as np
from statsmodels.nonparametric.smoothers_lowess import lowess
# monthly article counts per outlet: one row per (media, month), with a total
# column and one count column per query term; "month" parsed for temporal axes
d = pd.read_csv("medias-month-breakdown.csv",parse_dates=["month"])
# quick sanity check on dimensions
d.shape
(2727, 8)
# peek at the schema: per-query counts alongside the monthly total
d.head(1)
| media | month | total | query_climatique | query_effet_de_serre | query_biodiversite | query_transition | query_durable | |
|---|---|---|---|---|---|---|---|---|
| 0 | lacroix | 1970-01-01 | 2 | 0 | 0 | 0 | 0 | 0 |
# number of monthly rows available per outlet (coverage varies a lot)
d.value_counts("media")
media lemonde 946 lesechos 406 lacroix 350 liberation 317 leparisien 286 lefigaro 229 mediapart 193 dtype: int64
# Single-series exploration: monthly biodiversity-term counts for Libération.
# NOTE(review): the tooltip shows query_durable while the y axis shows
# query_biodiversite -- presumably a leftover from copy/paste; confirm.
liberation_rows = d[d["media"] == "liberation"]
alt.Chart(liberation_rows).mark_line(point=False).encode(
    x=alt.X("month:T", title="Month"),
    y=alt.Y("query_biodiversite:Q", title="Count"),
    tooltip=["month", "query_durable", "total"],
).properties(
    title="Count over Time",
    width=600,
    height=300,
)
The following plots could help us decide how to limit the horizontal extent of the plots and stripes. Right now, some of the plots bend down at the very right of the data because only part of the final month is recorded. I did not have time to fix up the extremes, but I think you'd either put NAs in the CSV or remove those rows.
# plot the monthly total for one outlet (index into the media list below)
media = ['lemonde', 'lesechos', 'lacroix', 'liberation', 'leparisien','lefigaro', 'mediapart']
i = 6
# i = 6 selects 'mediapart'
chart = alt.Chart(d[d["media"]==media[i]]).mark_line(point=False).encode(
    x=alt.X("month:T", title="Month"),
    y=alt.Y("total:Q", title="Count"),
    tooltip=["month", "query_durable","total"]
).properties(
    title="Total over Time - "+media[i],
    width=600,
    height=300
)
# horizontal reference lines at y = 500 and y = 1000
hline = alt.Chart(pd.DataFrame({'y': [500,1000]})).mark_rule(color='red', strokeDash=[5,3]).encode(
    y='y'
)
chart+hline
Now, shift from wide to long format...
# Reshape to tidy/long format: one row per (media, month, query term).
dm = d.melt(id_vars=["month", "media", "total"], var_name="query", value_name="count")
# Tidy the term labels: drop the 6-char "query_" prefix, capitalize, and turn
# underscores into spaces (e.g. "query_effet_de_serre" -> "Effet de serre").
# Then derive the monthly fraction and its arcsin-sqrt (variance-stabilizing)
# transform; assign() evaluates the lambdas in order on the updated frame.
dm = dm.assign(
    query=lambda f: f["query"].str[6:].str.capitalize().str.replace("_", " "),
    normalized=lambda f: f["count"] / f["total"],
    **{"as": lambda f: np.arcsin(np.sqrt(f["normalized"]))},
)
# have a look at one (media, query) slice (dl = Libération x "Climatique")
dl = dm[(dm["media"]=="liberation") & (dm["query"]=="Climatique")].copy()
dl
| month | media | total | query | count | normalized | as | |
|---|---|---|---|---|---|---|---|
| 2217 | 1998-01-01 | liberation | 2247 | Climatique | 4 | 0.001780 | 0.042204 |
| 2218 | 1998-02-01 | liberation | 2173 | Climatique | 4 | 0.001841 | 0.042917 |
| 2219 | 1998-03-01 | liberation | 2407 | Climatique | 7 | 0.002908 | 0.053954 |
| 2220 | 1998-04-01 | liberation | 2119 | Climatique | 3 | 0.001416 | 0.037635 |
| 2221 | 1998-05-01 | liberation | 2008 | Climatique | 8 | 0.003984 | 0.063161 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2529 | 2024-01-01 | liberation | 2177 | Climatique | 173 | 0.079467 | 0.285773 |
| 2530 | 2024-02-01 | liberation | 2155 | Climatique | 163 | 0.075638 | 0.278614 |
| 2531 | 2024-03-01 | liberation | 2520 | Climatique | 170 | 0.067460 | 0.262744 |
| 2532 | 2024-04-01 | liberation | 2571 | Climatique | 182 | 0.070790 | 0.269307 |
| 2533 | 2024-05-01 | liberation | 857 | Climatique | 79 | 0.092182 | 0.308484 |
317 rows × 7 columns
# every query term should appear once per (media, month) row after the melt
dm.value_counts("query")
query Biodiversite 2727 Climatique 2727 Durable 2727 Effet de serre 2727 Transition 2727 dtype: int64
Some plots...
# Distribution of the raw monthly fractions across all outlets and terms.
alt.Chart(dm).mark_bar().encode(
    y=alt.Y("count()", title="Count"),
    x=alt.X("normalized:Q", bin=alt.Bin(maxbins=100), title="monthly fractions"),
).properties(
    width=500,
    height=300,
    title="Histogram of monthly fractions",
)
# Same distribution after the asin-sqrt transform -- the 0's will mostly go
# away since they are associated with 0 totals.
alt.Chart(dm).mark_bar().encode(
    y=alt.Y("count()", title="Count"),
    x=alt.X("as:Q", bin=alt.Bin(maxbins=100), title="monthly fractions"),
).properties(
    width=500,
    height=300,
    title="Histogram of monthly fractions (asin-sqrt)",
)
# Faceted view of the normalized fractions, one panel per query term,
# for Le Monde from 1985 onward.
lemonde_recent = dm[(dm["month"] >= "1985-01-01") & (dm["media"] == "lemonde")]
alt.Chart(lemonde_recent).mark_line(point=False).encode(
    x=alt.X("month:T", title="Month"),
    y=alt.Y("normalized:Q", title="Fraction"),
    tooltip=["month", "query", "total"],
).facet(
    facet="query:N",
    columns=3,
).properties(
    title="Count over Time",
)
# asin-sqrt transform of the faceted view -- note this cell shows Libération,
# whereas the previous ("normalized") cell showed Le Monde
alt.Chart(dm[(dm["month"]>="1985-01-01") & (dm["media"]=="liberation")]).mark_line(point=False).encode(
    x=alt.X("month:T", title="Month"),
    y=alt.Y("as:Q", title="Fraction"),
    tooltip=["month", "query","total"]
).facet(
    facet="query:N",
    columns=3
).properties(
    title="Count over Time (asin-sqrt)",
    # width=600,
    # height=300
)
# now a grid over the terms with a different colored line per publication
alt.Chart(dm[(dm["month"]>="1985-01-01") ]).mark_line(point=False).encode(
    x=alt.X("month:T", title="Month"),
    y=alt.Y("as:Q", title="Fraction"),
    tooltip=["month", "query","total"],
    color="media"
).facet(
    facet="query:N",
    columns=3
).properties(
    title="Count over Time (asin-sqrt)",
    # width=600,
    # height=300
)
This is the previous smoother. In Python I couldn't find a lowess implementation that allowed for case weights as well as iterative fitting for robustness. So instead I called out to R.
# rows per outlet in the long frame (= monthly rows x 5 query terms)
dm.value_counts("media")
media lemonde 4730 lesechos 2030 lacroix 1750 liberation 1585 leparisien 1430 lefigaro 1145 mediapart 965 dtype: int64
# confirm each term still has the same number of rows
dm.value_counts("query")
query Biodiversite 2727 Climatique 2727 Durable 2727 Effet de serre 2727 Transition 2727 dtype: int64
Smooth the data... using loess + spikes
# define the function to do the smoothing
# (later we will actually call out to r instead)
def smooth_keep_spikes(y, frac=0.10, k=3.0, it=3):
    """Lowess-smooth a series while preserving large spikes.

    Fits a lowess curve to ``y`` against its integer index, estimates a
    robust sigma of the residuals via the MAD, and returns the smoothed
    value everywhere except where the absolute residual exceeds
    ``k * sigma`` -- those points keep their original (spiky) value.

    Parameters
    ----------
    y : array-like of float
        Series to smooth (a plain list or 1-D array; callers below pass
        ``list(df["as"])``).
    frac : float
        Fraction of the data used for each local lowess fit.
    k : float
        Spike threshold in robust standard deviations.
    it : int
        Number of robustifying iterations passed to lowess.

    Returns
    -------
    numpy.ndarray
        Smoothed series with spikes preserved, same length as ``y``.
    """
    # accept plain lists explicitly rather than relying on implicit coercion
    y = np.asarray(y, dtype=float)
    x = np.arange(len(y))
    base = lowess(y, x, frac=frac, it=it, return_sorted=False)
    res = y - base
    # robust scale: MAD scaled to sigma under normality (1.4826); the epsilon
    # guards against a zero MAD on perfectly flat stretches
    mad = np.median(np.abs(res - np.median(res))) + 1e-12
    sigma = 1.4826 * mad
    return np.where(np.abs(res) > k * sigma, y, base)
# now smooth the data and back-transform to proportions (sin^2 inverts arcsin-sqrt)
dl["as_loess"] = pd.Series(np.sin(smooth_keep_spikes(list(dl["as"]), frac=0.05, k=4.0))**2,index=dl.index)
Again, instead of using the smooth_keep_spikes() function in Python, let's build up the data and move it over to R.
# housekeeping
def cleanup_media_name(media):
    """Normalize a display name like "Les Échos" to its dataset key ("lesechos").

    Strips the accents on e/E, removes spaces, and lowercases, matching the
    values of the "media" column in the CSV files. Already-normalized keys
    pass through unchanged (the transform is idempotent).
    """
    return media.replace("é", "e").replace("É", "E").replace(" ", "").lower()
# reference month axis: Le Monde's months from 1985 on (it has the longest run)
dates = dm[["month"]][(dm["media"]=="lemonde") & (dm["month"]>="1985-01-01")]
# now bring the data to R: write one long CSV of (count, total, media, query)
medias = [
    "Médiapart",
    "Libération",
    "Le Monde",
    "La Croix",
    "Le Parisien",
    "Les Échos",
    "Le Figaro"
]
queries = [
    "Climatique",
    "Effet de serre",
    "Biodiversité",
    # "Durable",
    # "Transition"
]
chunks = []
for media in medias:
    # normalize once per outlet, without clobbering the loop variable
    # (the original reassigned `media` inside the inner loop, which only
    # worked because cleanup_media_name is idempotent)
    media_key = cleanup_media_name(media)
    for query in queries:
        # dm stores unaccented query labels
        query_key = query.replace("é", "e")
        sub = dm[(dm["media"] == media_key) & (dm["query"] == query_key)]
        e = sub.loc[sub["month"] >= "1985-01-01", ["count", "total"]].copy()
        e["media"] = media_key
        e["query"] = query_key
        chunks.append(e)
# single concat instead of growing the frame inside the loop (avoids the
# quadratic repeated-concat pattern, and no longer clobbers the raw frame `d`)
out = pd.concat(chunks, ignore_index=True)
out.to_csv("out.tmp", index=False)
Go out to R and come back
# the R code
# s = 0.25
# m = 2.9
#
# # read in data
# big = read.csv("out.tmp")
#
# for(media in unique(big$media)){
# for(query in unique(big$query)){
#
# r = big[big$media==media & big$query==query,]
#
# # set up smoothing problem
# r$x = 1:nrow(r)
# r$y = asin(sqrt((r$count+0.5)/r$total))
#
# # fit using iteration to minimize outliers
# # use proper weights for proportions after arcsin-sqrt
# f = loess(r$y~r$x,family="symmetric",weights=2*r$total,span=s)
#
# # standardized residuals
# res = 2*sqrt(r$total)*(r$y-predict(f))
#
# # find the mad and estimate sigma
# mad = median(abs(res-median(res)))
# sig = 1.4826*mad
#
# # find outliers
# smooth = ifelse(abs(res)>(m*sig),r$y,predict(f))
#
# big[big$media==media & big$query==query,"smooth"] = sin(smooth)^2
#
# plot(sin(r$y)^2,type="l")
# lines(sin(smooth)^2,col="red",type="l")
# }
# }
#
# write.csv(big,file="temp_data_from_R.csv",row.names=F)
# read the smoothed data back into python (the R script added a "smooth" column)
out = pd.read_csv("temp_data_from_R.csv")
# updated function to build one smoothed plot
def build_smoothed_plot(media_name, query_name, w, h, first_row=True, first_col=True):
    """Return one layered panel: faint raw fraction + red R-loess smooth.

    media_name / query_name are display names (accents allowed); w and h
    set the panel size. first_row controls whether the query title is
    shown, first_col whether the media name labels the y axis -- so a
    grid of these panels only labels its outer edges.
    """
    media_key = cleanup_media_name(media_name)
    query_key = query_name.replace("é", "e")
    panel = dm[(dm["media"] == media_key) & (dm["query"] == query_key)].copy()
    panel = panel[panel["month"] >= "1985-01-01"].reset_index(drop=True)
    # align the R smooth positionally -- out.tmp was written with the same
    # filter and row order, so a reset_index join by position is valid
    smooth_vals = out["smooth"][(out["media"] == media_key) & (out["query"] == query_key)].copy()
    panel["as_loess"] = smooth_vals.reset_index(drop=True)
    # shared encodings for both layers
    month_x = alt.X("month:T", title=None, scale=alt.Scale(domain=["1985-01-01", "2024-12-01"]))
    y_scale = alt.Scale(domain=[0, 0.2])
    y_title = media_name if first_col else None
    panel_title = query_name if first_row else ""
    smooth_layer = alt.Chart(panel).mark_line(point=False, color="red").encode(
        x=month_x,
        y=alt.Y("as_loess:Q", title=y_title, scale=y_scale).axis(format='%'),
        tooltip=["month", "query", "total"],
    ).properties(title=panel_title, width=w, height=h)
    raw_layer = alt.Chart(panel).mark_line(point=False, opacity=0.2).encode(
        x=month_x,
        y=alt.Y("normalized:Q", title=y_title, scale=y_scale).axis(format='%'),
        tooltip=["month", "query", "total"],
    ).properties(title=panel_title, width=w, height=h)
    # raw underneath, smooth on top
    return raw_layer + smooth_layer
build_smoothed_plot("liberation", "Climatique", 600, 300)
# now make a grid of small multiples: one row per outlet, one column per term
medias = [
    "Médiapart",
    "Libération",
    "Le Monde",
    "La Croix",
    "Le Parisien",
    "Les Échos",
    "Le Figaro"
]
queries = [
    "Climatique",
    "Effet de serre",
    "Biodiversité",
    # "Durable",
    # "Transition"
]
small_multiples = alt.vconcat()
for row_idx, media in enumerate(medias):
    row = alt.hconcat()
    for col_idx, query in enumerate(queries):
        print(query,media)
        # only the first row gets column titles, only the first column y labels
        row |= build_smoothed_plot(media, query, 400, 125, row_idx == 0, col_idx == 0)
    small_multiples &= row
small_multiples
Climatique Médiapart Effet de serre Médiapart Biodiversité Médiapart Climatique Libération Effet de serre Libération Biodiversité Libération Climatique Le Monde Effet de serre Le Monde Biodiversité Le Monde Climatique La Croix Effet de serre La Croix Biodiversité La Croix Climatique Le Parisien Effet de serre Le Parisien Biodiversité Le Parisien Climatique Les Échos Effet de serre Les Échos Biodiversité Les Échos Climatique Le Figaro Effet de serre Le Figaro Biodiversité Le Figaro
Now the variable media specifies the outlet, and we create four warming-stripes charts, one each for climatique, effet de serre, durable, and biodiversite.
# pick the outlet, then build one warming-stripes chart per query term
# (the original repeated the same ~10 lines four times; factored into a helper)
media = "leparisien"

def _warming_stripes(query_name, w=1000, h=300):
    """Warming-stripes chart of the "as" (arcsin-sqrt) series for `media`/`query_name`.

    Each month from 1985 on becomes one equally spaced stripe; color encodes
    the "as" value on a reversed red-blue scale (blue = low, red = high).
    The smoothed back-transformed series is also attached as "as_loess"
    (kept for parity with the original cells, which computed it too).
    """
    sub = dm[(dm["month"] >= "1985-01-01") & (dm["media"] == media) & (dm["query"] == query_name)].copy()
    sub["as_loess"] = pd.Series(
        np.sin(smooth_keep_spikes(list(sub["as"]), frac=0.05, k=4.0)) ** 2,
        index=sub.index,
    )
    sub = sub.reset_index(names="x")
    return alt.Chart(sub).mark_rect().encode(
        x=alt.X("x:O", axis=None),  # ordinal so stripes are equally spaced
        color=alt.Color(
            "as:Q",
            scale=alt.Scale(scheme="redblue", reverse=True),  # blue=cold, red=hot
            legend=None,
        ),
    ).properties(width=w, height=h)

chart1 = _warming_stripes("Climatique")
chart2 = _warming_stripes("Effet de serre")
chart3 = _warming_stripes("Durable")
chart4 = _warming_stripes("Biodiversite")
# stack the charts (note the deliberate order: Biodiversite before Durable);
# strokeWidth=0 removes the panel borders, and each term gets its own
# independent color scale
(chart1 & chart2 & chart4 & chart3).configure_view(strokeWidth=0).resolve_scale(color="independent")#.save(media+".png")
Compare media outlets with medias-month-breakdown-total.csv
The plot for the medias-month-breakdown-total.csv file.
Should probably put a thin black line around each plot and line them up so that they are white before their start date.
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
# read in corrected data: per-month totals and combined query_total per outlet
data = pd.read_csv("medias-month-breakdown-total.csv",parse_dates=["month"])
# fraction of each month's articles matching any of the query terms
data["frac"] = data["query_total"]/data["total"]
# cheap way to specify months starting Jan 1 1985 (Le Monde has the longest run)
dates = data[["month"]][(data["media"]=="lemonde") & (data["month"]>="1985-01-01")]
# spot-check one outlet: Le Figaro starts late and with very small totals
data[data["media"]=="lefigaro"].head(100)
| media | month | total | query_total | frac | |
|---|---|---|---|---|---|
| 350 | lefigaro | 2004-12-01 | 84 | 0 | 0.000000 |
| 351 | lefigaro | 2005-04-01 | 2 | 0 | 0.000000 |
| 352 | lefigaro | 2005-05-01 | 3 | 0 | 0.000000 |
| 353 | lefigaro | 2005-06-01 | 3 | 0 | 0.000000 |
| 354 | lefigaro | 2005-07-01 | 23 | 0 | 0.000000 |
| ... | ... | ... | ... | ... | ... |
| 445 | lefigaro | 2013-02-01 | 12927 | 95 | 0.007349 |
| 446 | lefigaro | 2013-03-01 | 13033 | 100 | 0.007673 |
| 447 | lefigaro | 2013-04-01 | 12993 | 87 | 0.006696 |
| 448 | lefigaro | 2013-05-01 | 12855 | 127 | 0.009879 |
| 449 | lefigaro | 2013-06-01 | 12628 | 104 | 0.008236 |
100 rows × 5 columns
# code to make each stripe row of the stacked warming-stripes figure
def make_stripes(media_name, dates, data, w,h):
    """Build one warming-stripes row for `media_name` over the shared `dates` axis.

    Months absent for the outlet, or with total <= 250 articles, have their
    fraction blanked to NA and render white; a black rectangle outlines the
    span between the first and last month that has real data. Returns a
    layered Altair chart of size w x h.
    """
    media = cleanup_media_name(media_name)
    dates_media = dates.copy()
    data_media = data[data["media"] == media][["month", "frac","total"]].copy()
    # left-join onto the shared month axis so every outlet spans the same x range
    merged = pd.merge(dates_media, data_media, on="month", how="left")
    # keep order consistent (the ordinal x channel uses the month as a string)
    merged["month_str"] = merged["month"].astype(str)
    # months with too few articles (<= 250) are treated as missing
    merged["has_data"] = (~merged["frac"].isna()) & (merged["total"]>250)
    merged.loc[~merged["has_data"],"frac"] = pd.NA
    # locate the first and last month with data (ends of the outline rectangle)
    if merged["has_data"].any():
        first_idx = merged.index[merged["has_data"]].min()
        last_idx = merged.index[merged["has_data"]].max()
        first_month = merged.loc[first_idx, "month_str"]
        last_month = merged.loc[last_idx, "month_str"]
    else:
        first_month = last_month = None
    # main stripe layer; stripes are drawn 1.25x their slot width to avoid gaps
    color_scale = alt.Scale(scheme="redblue", reverse=True)
    stripes = (
        alt.Chart(merged)
        .mark_rect(stroke=None,width=1.25*(w)/merged.shape[0])
        .encode(
            x=alt.X("month_str:O", axis=None),
            # NOTE(review): `alt.datum.frac != None` presumably compiles to a
            # null test in the Vega expression, so NA months fall through to
            # the white value -- confirm against the rendered spec
            color=alt.condition(
                alt.datum.frac != None,
                alt.Color("frac:Q", scale=color_scale, legend=None),
                alt.value("white")
            ),
        )
        .properties(width=w, height=h, title="%s" % media_name)
    )
    # outline layer: one unfilled black rect spanning first..last data month
    outline_df = pd.DataFrame({"x1": [first_month], "x2": [last_month]})
    outline = (
        alt.Chart(outline_df)
        .mark_rect(fill=None, stroke="black", strokeWidth=1)
        .encode(
            x=alt.X("x1:O"),
            x2=alt.X2("x2:O")
        )
        .properties(width=w, height=h)
    )
    chart = stripes + outline
    return chart
# rows available per outlet in the totals file
data.value_counts("media")
media lemonde 946 lesechos 406 lacroix 350 liberation 317 leparisien 286 lefigaro 229 mediapart 193 dtype: int64
# one stripes row per outlet, all aligned on the shared 1985+ month axis
stripes0 = make_stripes("Médiapart",dates,data,800,100)
stripes1 = make_stripes("Libération",dates,data,800,100)
stripes2 = make_stripes("Le Monde",dates,data,800,100)
stripes3 = make_stripes("La Croix",dates,data,800,100)
stripes4 = make_stripes("Le Parisien",dates,data,800,100)
stripes5 = make_stripes("Les Échos",dates,data,800,100)
stripes6 = make_stripes("Le Figaro",dates,data,800,100)
# NOTE(review): stripes0 (Médiapart) is built but NOT included in the stack
# below -- confirm whether its omission is intentional or an oversight
chart_stack = (alt.vconcat(stripes1, stripes2, stripes3, stripes4, stripes5, stripes6)
    .resolve_scale(x='shared',color="independent")
)
chart_stack
# HACK: an invisible (opacity=0, height=1) line chart whose only purpose is to
# draw a labeled temporal axis under the stacked stripes, which themselves use
# an ordinal x channel with no axis
timeAxis = alt.Chart(data[(data["month"]>="1985-01-01") & (data["media"]=="lemonde")]).mark_line(point=False, opacity=0).encode(
    x=alt.X("month:T", title="Date"),
    y=alt.Y("frac:O", title=None, axis=None)
).properties(
    width=800,
    height=1
)
timeAxis
# final figure: the stripe stack with the shared date axis appended underneath
alt.vconcat(chart_stack, timeAxis).configure_view(stroke=None)